In [56]:
import pandas as pd
data=pd.read_excel('Capstone_final_dataset.xlsx')
data.head()
Out[56]:
time0_year first_tx race black SEX age above74 age_cat ABIRATERONE enza ... a1c_cat cindex_b_Romano char_cat Total_Elixhauser_Groups DOCETAXEL_bf_time0 fi_score_cat dis_number fi_score frail gleason_reviewed
0 2017 ENZALUTA White 0 M 70 0 3 0 1 ... 1. <5.6 2 1 3 0 2. pre-f 4 0.129032 0 9
1 2015 ABIRATER White 0 M 68 0 2 1 0 ... 1. <5.6 4 2 7 0 1. non-f 3 0.096774 0 7
2 2015 ABIRATER White 0 M 71 0 3 1 0 ... 1. <5.6 3 1 5 0 1. non-f 3 0.096774 0
3 2015 ABIRATER White 0 M 79 1 3 1 0 ... 4. >=7.2 7 3 10 0 3. mild 9 0.290323 1 9
4 2016 ABIRATER White 0 M 90 1 5 1 0 ... NaN 3 1 6 0 4. moder 10 0.322581 1

5 rows × 32 columns

In [57]:
data.shape
Out[57]:
(5822, 32)
In [58]:
data.isnull().sum()
Out[58]:
0
time0_year 0
first_tx 0
race 0
black 0
SEX 0
age 0
above74 0
age_cat 0
ABIRATERONE 0
enza 0
first_tx_last_day_Supply 0
days_bt_first_last_prescrib 0
first_tx_daysSupply_sum 0
fu_end_date_year 0
death 0
crcl_cat 0
albumin_cat 0
bilirubin_cat 0
hgb_cat 0
psa_cat 0
PSACAT 0
BMI_cat 0
a1c_cat 2401
cindex_b_Romano 0
char_cat 0
Total_Elixhauser_Groups 0
DOCETAXEL_bf_time0 0
fi_score_cat 0
dis_number 0
fi_score 0
frail 0
gleason_reviewed 0

In [59]:
data.drop('a1c_cat', axis=1, inplace=True)
In [60]:
data.head()
Out[60]:
time0_year first_tx race black SEX age above74 age_cat ABIRATERONE enza ... BMI_cat cindex_b_Romano char_cat Total_Elixhauser_Groups DOCETAXEL_bf_time0 fi_score_cat dis_number fi_score frail gleason_reviewed
0 2017 ENZALUTA White 0 M 70 0 3 0 1 ... 2 2 1 3 0 2. pre-f 4 0.129032 0 9
1 2015 ABIRATER White 0 M 68 0 2 1 0 ... 4 4 2 7 0 1. non-f 3 0.096774 0 7
2 2015 ABIRATER White 0 M 71 0 3 1 0 ... 4 3 1 5 0 1. non-f 3 0.096774 0
3 2015 ABIRATER White 0 M 79 1 3 1 0 ... 2 7 3 10 0 3. mild 9 0.290323 1 9
4 2016 ABIRATER White 0 M 90 1 5 1 0 ... 4 3 1 6 0 4. moder 10 0.322581 1

5 rows × 31 columns

In [61]:
data.isnull().sum()
Out[61]:
0
time0_year 0
first_tx 0
race 0
black 0
SEX 0
age 0
above74 0
age_cat 0
ABIRATERONE 0
enza 0
first_tx_last_day_Supply 0
days_bt_first_last_prescrib 0
first_tx_daysSupply_sum 0
fu_end_date_year 0
death 0
crcl_cat 0
albumin_cat 0
bilirubin_cat 0
hgb_cat 0
psa_cat 0
PSACAT 0
BMI_cat 0
cindex_b_Romano 0
char_cat 0
Total_Elixhauser_Groups 0
DOCETAXEL_bf_time0 0
fi_score_cat 0
dis_number 0
fi_score 0
frail 0
gleason_reviewed 0

In [62]:
import seaborn as sns
import matplotlib.pyplot as plt
In [63]:
# Check unique values in 'first_tx'
print(data['first_tx'].unique())
['ENZALUTA' 'ABIRATER']
In [64]:
# Mapping categorical values to numeric
data['first_tx'] = data['first_tx'].map({'ENZALUTA': 1, 'ABIRATER': 0})
data.head()
Out[64]:
time0_year first_tx race black SEX age above74 age_cat ABIRATERONE enza ... BMI_cat cindex_b_Romano char_cat Total_Elixhauser_Groups DOCETAXEL_bf_time0 fi_score_cat dis_number fi_score frail gleason_reviewed
0 2017 1 White 0 M 70 0 3 0 1 ... 2 2 1 3 0 2. pre-f 4 0.129032 0 9
1 2015 0 White 0 M 68 0 2 1 0 ... 4 4 2 7 0 1. non-f 3 0.096774 0 7
2 2015 0 White 0 M 71 0 3 1 0 ... 4 3 1 5 0 1. non-f 3 0.096774 0
3 2015 0 White 0 M 79 1 3 1 0 ... 2 7 3 10 0 3. mild 9 0.290323 1 9
4 2016 0 White 0 M 90 1 5 1 0 ... 4 3 1 6 0 4. moder 10 0.322581 1

5 rows × 31 columns

In [65]:
# Check unique values in 'race'
print(data['race'].unique())
['White' 'Black' 'Other' 'Unknown']
In [66]:
# Mapping categorical values to numeric
data['race'] = data['race'].map({'White': 1, 'Black' : 0, 'Other': 3, 'Unknown': 4})
data.head()
Out[66]:
time0_year first_tx race black SEX age above74 age_cat ABIRATERONE enza ... BMI_cat cindex_b_Romano char_cat Total_Elixhauser_Groups DOCETAXEL_bf_time0 fi_score_cat dis_number fi_score frail gleason_reviewed
0 2017 1 1 0 M 70 0 3 0 1 ... 2 2 1 3 0 2. pre-f 4 0.129032 0 9
1 2015 0 1 0 M 68 0 2 1 0 ... 4 4 2 7 0 1. non-f 3 0.096774 0 7
2 2015 0 1 0 M 71 0 3 1 0 ... 4 3 1 5 0 1. non-f 3 0.096774 0
3 2015 0 1 0 M 79 1 3 1 0 ... 2 7 3 10 0 3. mild 9 0.290323 1 9
4 2016 0 1 0 M 90 1 5 1 0 ... 4 3 1 6 0 4. moder 10 0.322581 1

5 rows × 31 columns

In [67]:
# Check unique values in 'sex'
print(data['SEX'].unique())
['M']
In [68]:
data.drop('SEX', axis=1, inplace=True)
In [69]:
data.head()
Out[69]:
time0_year first_tx race black age above74 age_cat ABIRATERONE enza first_tx_last_day_Supply ... BMI_cat cindex_b_Romano char_cat Total_Elixhauser_Groups DOCETAXEL_bf_time0 fi_score_cat dis_number fi_score frail gleason_reviewed
0 2017 1 1 0 70 0 3 0 1 30 ... 2 2 1 3 0 2. pre-f 4 0.129032 0 9
1 2015 0 1 0 68 0 2 1 0 90 ... 4 4 2 7 0 1. non-f 3 0.096774 0 7
2 2015 0 1 0 71 0 3 1 0 30 ... 4 3 1 5 0 1. non-f 3 0.096774 0
3 2015 0 1 0 79 1 3 1 0 28 ... 2 7 3 10 0 3. mild 9 0.290323 1 9
4 2016 0 1 0 90 1 5 1 0 30 ... 4 3 1 6 0 4. moder 10 0.322581 1

5 rows × 30 columns

In [70]:
# Check unique values in 'fi_score_cat'
print(data['fi_score_cat'].unique())
['2. pre-f' '1. non-f' '3. mild' '4. moder' '5. sever']
In [71]:
# Mapping categorical values to numeric
data['fi_score_cat'] = data['fi_score_cat'].map({'1. non-f': 0, '2. pre-f' : 1, '3. mild': 2, '4. moder': 3,'5. sever':5})
data.head()
Out[71]:
time0_year first_tx race black age above74 age_cat ABIRATERONE enza first_tx_last_day_Supply ... BMI_cat cindex_b_Romano char_cat Total_Elixhauser_Groups DOCETAXEL_bf_time0 fi_score_cat dis_number fi_score frail gleason_reviewed
0 2017 1 1 0 70 0 3 0 1 30 ... 2 2 1 3 0 1 4 0.129032 0 9
1 2015 0 1 0 68 0 2 1 0 90 ... 4 4 2 7 0 0 3 0.096774 0 7
2 2015 0 1 0 71 0 3 1 0 30 ... 4 3 1 5 0 0 3 0.096774 0
3 2015 0 1 0 79 1 3 1 0 28 ... 2 7 3 10 0 2 9 0.290323 1 9
4 2016 0 1 0 90 1 5 1 0 30 ... 4 3 1 6 0 3 10 0.322581 1

5 rows × 30 columns

In [72]:
# Check unique values in 'psa_cat'
print(data['psa_cat'].unique())
['Cat5. 50' 'Cat2. 4' 'Cat3. 10' 'Cat4. 20' 'Cat7. 20' 'Cat1. 0'
 'Cat6. 10' 'Unknown']
In [73]:
# Mapping categorical values to numeric
data['psa_cat'] = data['psa_cat'].map({'Cat5. 50':5, 'Cat2. 4':2, 'Cat3. 10':3, 'Cat4. 20':4, 'Cat7. 20':7, 'Cat1. 0':1,
 'Cat6. 10':6, 'Unknown':0})
data.head()
Out[73]:
time0_year first_tx race black age above74 age_cat ABIRATERONE enza first_tx_last_day_Supply ... BMI_cat cindex_b_Romano char_cat Total_Elixhauser_Groups DOCETAXEL_bf_time0 fi_score_cat dis_number fi_score frail gleason_reviewed
0 2017 1 1 0 70 0 3 0 1 30 ... 2 2 1 3 0 1 4 0.129032 0 9
1 2015 0 1 0 68 0 2 1 0 90 ... 4 4 2 7 0 0 3 0.096774 0 7
2 2015 0 1 0 71 0 3 1 0 30 ... 4 3 1 5 0 0 3 0.096774 0
3 2015 0 1 0 79 1 3 1 0 28 ... 2 7 3 10 0 2 9 0.290323 1 9
4 2016 0 1 0 90 1 5 1 0 30 ... 4 3 1 6 0 3 10 0.322581 1

5 rows × 30 columns

In [74]:
import pandas as pd

# Assuming df is your DataFrame
data['gleason_reviewed'] = data['gleason_reviewed'].fillna(0)
In [75]:
data.isnull().sum()
Out[75]:
0
time0_year 0
first_tx 0
race 0
black 0
age 0
above74 0
age_cat 0
ABIRATERONE 0
enza 0
first_tx_last_day_Supply 0
days_bt_first_last_prescrib 0
first_tx_daysSupply_sum 0
fu_end_date_year 0
death 0
crcl_cat 0
albumin_cat 0
bilirubin_cat 0
hgb_cat 0
psa_cat 0
PSACAT 0
BMI_cat 0
cindex_b_Romano 0
char_cat 0
Total_Elixhauser_Groups 0
DOCETAXEL_bf_time0 0
fi_score_cat 0
dis_number 0
fi_score 0
frail 0
gleason_reviewed 0

In [76]:
# Replace empty strings with NaN
import numpy as np
data = data.replace(' ', np.nan)
<ipython-input-76-5a78547b3815>:3: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  data = data.replace(' ', np.nan)
In [77]:
import pandas as pd

# Assuming df is your DataFrame
data['gleason_reviewed'] = data['gleason_reviewed'].fillna(0)
In [78]:
# Check unique values in 'gleason_reviewed'
print(data['gleason_reviewed'].unique())
[ 9.  7.  0.  8.  6. 10.  5.  2.  4.  3.]
In [79]:
# Calculate the correlation matrix
correlation_matrix = data.corr()

# Create a heatmap
plt.figure(figsize=(20, 15))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.show()
In [80]:
import matplotlib.pyplot as plt
import seaborn as sns
# Set the aesthetics for the plots
sns.set(style='whitegrid')

# List of all columns in the DataFrame excluding 'death'
columns_to_visualize = data.columns.drop('death')

# Create a figure to visualize the relationship with 'death'
plt.figure(figsize=(20, 25))

# Loop through each column and create plots
for i, col in enumerate(columns_to_visualize):
    plt.subplot(6, 6, i + 1)  # Adjust the number of rows and columns for the grid
    if data[col].nunique() < 20:  # Categorical variable
        sns.countplot(x='death', hue=col, data=data, palette='viridis')
        plt.title(f'Death vs {col}')
    else:  # Numeric variable
        sns.boxplot(x='death', y=col, data=data, palette='viridis')
        plt.title(f'Death vs {col}')
    plt.xlabel('Death (1 = Yes, 0 = No)')
    plt.ylabel(col)

plt.tight_layout()
plt.show()
<ipython-input-80-4ca33512ba3f>:19: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='death', y=col, data=data, palette='viridis')
<ipython-input-80-4ca33512ba3f>:19: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='death', y=col, data=data, palette='viridis')
<ipython-input-80-4ca33512ba3f>:19: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='death', y=col, data=data, palette='viridis')
<ipython-input-80-4ca33512ba3f>:19: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='death', y=col, data=data, palette='viridis')
<ipython-input-80-4ca33512ba3f>:19: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='death', y=col, data=data, palette='viridis')
<ipython-input-80-4ca33512ba3f>:19: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='death', y=col, data=data, palette='viridis')
<ipython-input-80-4ca33512ba3f>:19: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='death', y=col, data=data, palette='viridis')
In [81]:
import matplotlib.pyplot as plt
pd.plotting.scatter_matrix(data, figsize=(50, 50))
plt.show()
In [82]:
print(data.columns)
Index(['time0_year', 'first_tx', 'race', 'black', 'age', 'above74', 'age_cat',
       'ABIRATERONE', 'enza', 'first_tx_last_day_Supply',
       'days_bt_first_last_prescrib', 'first_tx_daysSupply_sum',
       'fu_end_date_year', 'death', 'crcl_cat', 'albumin_cat', 'bilirubin_cat',
       'hgb_cat', 'psa_cat', 'PSACAT', 'BMI_cat', 'cindex_b_Romano',
       'char_cat', 'Total_Elixhauser_Groups', 'DOCETAXEL_bf_time0',
       'fi_score_cat', 'dis_number', 'fi_score', 'frail', 'gleason_reviewed'],
      dtype='object')
In [83]:
import matplotlib.pyplot as plt

# Grouping the data by the features and calculating the mean of 'death'
enza_death = data.groupby('enza')['death'].mean()
abiraterone_death = data.groupby('ABIRATERONE')['death'].mean()
psa_cat_death = data.groupby('psa_cat')['death'].mean()

# Plotting death vs. enza
plt.figure(figsize=(12, 6))
plt.plot(enza_death.index, enza_death.values, marker='o', label='Death vs Enza')
plt.title('Line Graph: Death vs Enza')
plt.xlabel('Enza (0 or 1)')
plt.ylabel('Mean Death')
plt.legend()
plt.grid()
plt.show()

# Plotting death vs. ABIRATERONE
plt.figure(figsize=(12, 6))
plt.plot(abiraterone_death.index, abiraterone_death.values, marker='o', color='orange', label='Death vs ABIRATERONE')
plt.title('Line Graph: Death vs ABIRATERONE')
plt.xlabel('ABIRATERONE (0 or 1)')
plt.ylabel('Mean Death')
plt.legend()
plt.grid()
plt.show()

# Plotting death vs. psa_cat
plt.figure(figsize=(12, 6))
plt.plot(psa_cat_death.index, psa_cat_death.values, marker='o', color='green', label='Death vs PSA Category')
plt.title('Line Graph: Death vs PSA Category')
plt.xlabel('PSA Category')
plt.ylabel('Mean Death')
plt.legend()
plt.grid()
plt.show()

LINEAR REGRESSION

In [96]:
#Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix

#Selecting required attributes for analysis
X = data[['time0_year', 'first_tx', 'race', 'black', 'age', 'above74', 'age_cat',
       'ABIRATERONE', 'enza', 'first_tx_last_day_Supply',
       'days_bt_first_last_prescrib', 'first_tx_daysSupply_sum',
       'fu_end_date_year', 'crcl_cat', 'albumin_cat', 'bilirubin_cat',
       'hgb_cat', 'psa_cat', 'PSACAT', 'BMI_cat', 'cindex_b_Romano',
       'char_cat', 'Total_Elixhauser_Groups', 'DOCETAXEL_bf_time0',
       'fi_score_cat', 'dis_number', 'fi_score', 'frail', 'gleason_reviewed']]
y = data['death']  # Target variable

#Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

#Fitting a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

#Making predictions on the test set
y_pred = model.predict(X_test)
# Printing the results
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
Training Accuracy: 0.9742323384152888
Test Accuracy: 0.967381974248927

LOGISTIC REGRESSION

In [87]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Selecting required attributes for analysis
X = data[['time0_year', 'first_tx', 'race', 'black', 'age', 'above74', 'age_cat',
       'ABIRATERONE', 'enza', 'first_tx_last_day_Supply',
       'days_bt_first_last_prescrib', 'first_tx_daysSupply_sum',
       'fu_end_date_year', 'crcl_cat', 'albumin_cat', 'bilirubin_cat',
       'hgb_cat', 'psa_cat', 'PSACAT', 'BMI_cat', 'cindex_b_Romano',
       'char_cat', 'Total_Elixhauser_Groups', 'DOCETAXEL_bf_time0',
       'fi_score_cat', 'dis_number', 'fi_score', 'frail', 'gleason_reviewed']]  # Selecting features
y = data['death']  # Target variable

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

# Fitting a Logistic Regression model
model = LogisticRegression(max_iter=1000)  # You can increase max_iter if convergence issues arise
model.fit(X_train, y_train)

# Making predictions on the test set
y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

# Evaluating the model
test_accuracy = accuracy_score(y_test, y_pred_test)
train_accuracy = accuracy_score(y_train, y_pred_train)

# Printing the results
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

# Confusion matrix for the test data
conf_matrix = confusion_matrix(y_test, y_pred_test)
classification_rep = classification_report(y_test, y_pred_test)

print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_rep)
Training Accuracy: 0.8937083959630664
Test Accuracy: 0.8755364806866953
Confusion Matrix:
 [[186  86]
 [ 59 834]]
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.68      0.72       272
           1       0.91      0.93      0.92       893

    accuracy                           0.88      1165
   macro avg       0.83      0.81      0.82      1165
weighted avg       0.87      0.88      0.87      1165

/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
In [88]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Initialize a StandardScaler
scaler = StandardScaler()

# Fit and transform the training data, and transform the test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit logistic regression with scaled data
log_reg_model = LogisticRegression(max_iter=10000000)
log_reg_model.fit(X_train_scaled, y_train)

# Cross-validation
cross_val_scores = cross_val_score(log_reg_model, X_train_scaled, y_train, cv=5)
print("Cross-validation scores:", cross_val_scores)
print("Mean cross-validation score:", cross_val_scores.mean())

# Predictions on test data
y_pred = log_reg_model.predict(X_test_scaled)

# Evaluate accuracy
train_accuracy = log_reg_model.score(X_train_scaled, y_train)
test_accuracy = log_reg_model.score(X_test_scaled, y_test)

print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)
Cross-validation scores: [0.97532189 0.972103   0.97099893 0.97851772 0.97207304]
Mean cross-validation score: 0.973802916242169
Training Accuracy: 0.9742323384152888
Test Accuracy: 0.9682403433476395
In [89]:
log_reg_model = LogisticRegression(solver='saga', max_iter=10000000)
log_reg_model.fit(X_train_scaled, y_train)
Out[89]:
LogisticRegression(max_iter=10000000, solver='saga')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(max_iter=10000000, solver='saga')
In [91]:
from sklearn.model_selection import cross_val_score

# Perform 5-fold cross-validation
cv_scores = cross_val_score(model, X, y, cv=5)

print(f'Cross-validation scores: {cv_scores}')
print(f'Mean cross-validation score: {cv_scores.mean()}')
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Cross-validation scores: [0.8832618  0.89613734 0.88745704 0.89690722 0.89261168]
Mean cross-validation score: 0.8912750173296168
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
In [92]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and fit a Random Forest model
rf_model = RandomForestClassifier()
cv_scores_rf = cross_val_score(rf_model, X, y, cv=5)
print(f'Cross-validation scores for Random Forest: {cv_scores_rf}')
print(f'Mean cross-validation score for Random Forest: {cv_scores_rf.mean()}')
Cross-validation scores for Random Forest: [0.9751073  0.97339056 0.97164948 0.97508591 0.96563574]
Mean cross-validation score for Random Forest: 0.9721737976195743
In [93]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize Grid Search
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X, y)

# Best parameters and score
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best cross-validation score: {grid_search.best_score_}')
Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best parameters: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Best cross-validation score: 0.9730324616904857

the grid search successfully completed, and we found the best parameters for the Random Forest model:

Best Parameters:

bootstrap: True max_depth: 10 min_samples_leaf: 4 min_samples_split: 2 n_estimators: 100 Best Cross-Validation Score: 0.973

RANDOM FOREST

In [94]:
# Train the final Random Forest model with the best parameters
rf_best = RandomForestClassifier(
    bootstrap=True,
    max_depth=10,
    min_samples_leaf=4,
    min_samples_split=2,
    n_estimators=100,
    random_state=42
)

# Fit the model on the training data
rf_best.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_best.predict(X_test)

# Evaluate the model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

test_accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Test Accuracy: {test_accuracy}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')
Test Accuracy: 0.967381974248927
Confusion Matrix:
[[270   2]
 [ 36 857]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.99      0.93       272
           1       1.00      0.96      0.98       893

    accuracy                           0.97      1165
   macro avg       0.94      0.98      0.96      1165
weighted avg       0.97      0.97      0.97      1165

The Random Forest model has performed quite well on the test set:

Test Accuracy: 96.7%, indicating the model correctly classified about 97% of the instances. Confusion Matrix: True Negatives (correctly predicted 0): 270 False Positives (incorrectly predicted 1): 2 False Negatives (incorrectly predicted 0): 36 True Positives (correctly predicted 1): 857 Classification Report: Class 0 (Non-survivor): Precision: 0.88 (88% of instances predicted as class 0 were correct) Recall: 0.99 (99% of actual class 0 instances were correctly identified) F1-score: 0.93 (harmonic mean of precision and recall) Class 1 (Survivor): Precision: 1.00 (100% of instances predicted as class 1 were correct) Recall: 0.96 (96% of actual class 1 instances were correctly identified) F1-score: 0.98 The model has a high precision and recall, especially for class 1, suggesting it is very effective at predicting survival. It might be worth exploring feature importance or partial dependence plots to understand which features contribute most to the predictions.

In [95]:
#PREDICTING THE MODEL
import pandas as pd
import numpy as np


# Select a specific row of data (e.g., the first row)
input_data = data.iloc[0, :29].values

# Convert the selected row to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# Reshape the array for prediction (only one instance)
data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Make the prediction using the model
prediction = model.predict(data_reshaped)

# Output the prediction result
if prediction[0] == 0:
    print('The Person did not die due to prostate cancer')
else:
    print('The Person has died due to prostate cancer')
The Person has died due to prostate cancer
/usr/local/lib/python3.10/dist-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but LogisticRegression was fitted with feature names
  warnings.warn(
In [104]:
# Importing the necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Assuming 'data' is the DataFrame that contains the dataset, and 'death' is the target variable
X = data.drop(columns=['death'])
y = data['death']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

# Standardizing the features for KNN
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 1. K-Nearest Neighbors (KNN) Model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_scaled, y_train)

# Predictions and Evaluation for KNN
knn_predictions = knn_model.predict(X_test_scaled)
knn_accuracy = accuracy_score(y_test, knn_predictions)
knn_conf_matrix = confusion_matrix(y_test, knn_predictions)
knn_class_report = classification_report(y_test, knn_predictions)

# Calculate training accuracy
train_predictions = knn_model.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, train_predictions)

# Calculate test accuracy
test_accuracy = accuracy_score(y_test, knn_predictions)

# Display training and test accuracy
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

print("KNN Model Evaluation")
print("Accuracy:", knn_accuracy)
print("Confusion Matrix:\n", knn_conf_matrix)
print("Classification Report:\n", knn_class_report)
Training Accuracy: 0.9233412067854843
Test Accuracy: 0.8721030042918455
KNN Model Evaluation
Accuracy: 0.8721030042918455
Confusion Matrix:
 [[156 116]
 [ 33 860]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.57      0.68       272
           1       0.88      0.96      0.92       893

    accuracy                           0.87      1165
   macro avg       0.85      0.77      0.80      1165
weighted avg       0.87      0.87      0.86      1165

In [103]:
# 2. Naive Bayes Model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Predictions and Evaluation for Naive Bayes
nb_predictions = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_predictions)
nb_conf_matrix = confusion_matrix(y_test, nb_predictions)
nb_class_report = classification_report(y_test, nb_predictions)

# Calculate training accuracy
train_predictions = nb_model.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, train_predictions)

# Calculate test accuracy
test_accuracy = accuracy_score(y_test, nb_predictions)

# Display training and test accuracy
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

print("\nNaive Bayes Model Evaluation")
print("Accuracy:", nb_accuracy)
print("Confusion Matrix:\n", nb_conf_matrix)
print("Classification Report:\n", nb_class_report)
Training Accuracy: 0.789778827571398
Test Accuracy: 0.9622317596566523

Naive Bayes Model Evaluation
Accuracy: 0.9622317596566523
Confusion Matrix:
 [[260  12]
 [ 32 861]]
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.96      0.92       272
           1       0.99      0.96      0.98       893

    accuracy                           0.96      1165
   macro avg       0.94      0.96      0.95      1165
weighted avg       0.96      0.96      0.96      1165

/usr/local/lib/python3.10/dist-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but GaussianNB was fitted with feature names
  warnings.warn(
In [102]:
# Importing the necessary libraries
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


X = data.drop(columns=['death'])
y = data['death']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

# Standardizing the features for SVM
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SVM Model with a linear kernel
svm_model = SVC(kernel='linear', C=1.0, random_state=50)
svm_model.fit(X_train_scaled, y_train)

# Predictions and Evaluation for SVM
svm_predictions = svm_model.predict(X_test_scaled)
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_conf_matrix = confusion_matrix(y_test, svm_predictions)
svm_class_report = classification_report(y_test, svm_predictions)

# Calculate training accuracy
train_predictions = svm_model.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, train_predictions)

# Calculate test accuracy
test_accuracy = accuracy_score(y_test, svm_predictions)

# Display training and test accuracy
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

print("SVM Model Evaluation")
print("Accuracy:", svm_accuracy)
print("Confusion Matrix:\n", svm_conf_matrix)
print("Classification Report:\n", svm_class_report)
Training Accuracy: 0.9740176079020829
Test Accuracy: 0.9682403433476395
SVM Model Evaluation
Accuracy: 0.9682403433476395
Confusion Matrix:
 [[272   0]
 [ 37 856]]
Classification Report:
               precision    recall  f1-score   support

           0       0.88      1.00      0.94       272
           1       1.00      0.96      0.98       893

    accuracy                           0.97      1165
   macro avg       0.94      0.98      0.96      1165
weighted avg       0.97      0.97      0.97      1165

In [99]:
#@title Convert ipynb to HTML in Colab
# Upload ipynb
from google.colab import files
f = files.upload()

# Convert ipynb to html
import subprocess
file0 = list(f.keys())[0]
_ = subprocess.run(["pip", "install", "nbconvert"])
_ = subprocess.run(["jupyter", "nbconvert", file0, "--to", "html"])

# download the html
files.download(file0[:-5]+"html")
Upload widget is only available when the cell has been executed in the current browser session. Please rerun this cell to enable.
Saving capstone (1).ipynb to capstone (1).ipynb